{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.1"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python391jvsc74a57bd0b8f714736076dee7cc4e7cd002fa6517bd2bcb67ffacfab068ce43d1637242d6",
   "display_name": "Python 3.9.1 64-bit ('base': conda)"
  },
  "metadata": {
   "interpreter": {
    "hash": "b8f714736076dee7cc4e7cd002fa6517bd2bcb67ffacfab068ce43d1637242d6"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Integer class id used to encode the labels of the `move` column.\n",
    "class_dict = {\n",
    "    'OBJECTIVE': 0,\n",
    "    'METHODS': 1,\n",
    "    'RESULTS': 2,\n",
    "    'CONCLUSIONS': 3,\n",
    "    'BACKGROUND': 4,\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The first CSV column has no header and mirrors the row index; read it as the\n",
    "# index so it does not show up as an `Unnamed: 0` data column.\n",
    "df = pd.read_csv('result_test.csv', index_col=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rows per `id`, summarised. A bare GroupBy object only displays its type and\n",
    "# memory address, so aggregate it to get an informative, repeatable output.\n",
    "df.groupby('id').size().describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of distinct `id` groups. Printing the GroupBy object itself only\n",
    "# shows its type and memory address.\n",
    "df.groupby('id').ngroups"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "\n",
    "TEST_SIZE = 500  # number of ids sampled into test_ids\n",
    "SPLIT_SEED = 42  # fixed seed so Restart & Run All reproduces the same split\n",
    "\n",
    "# Sorted unique ids give the sampler a stable population order.\n",
    "ids = sorted(set(df['id'].to_list()))\n",
    "# A private Random instance keeps the split independent of the global random state.\n",
    "test_ids = random.Random(SPLIT_SEED).sample(ids, TEST_SIZE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "500"
      ]
     },
     "metadata": {},
     "execution_count": 19
    }
   ],
   "source": [
    "len(test_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set membership is O(1); scanning the test_ids list once per id is not.\n",
    "held_out_ids = set(test_ids)\n",
    "# Every id that was not sampled into test_ids, in the same order as ids.\n",
    "train_ids = [i for i in ids if i not in held_out_ids]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check: the two id lists together must account for every id.\n",
    "assert len(train_ids) + len(test_ids) == len(ids), 'train/test ids do not add up to ids'\n",
    "len(train_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "       Unnamed: 0       id         move  \\\n",
       "0               0    78587   BACKGROUND   \n",
       "1               1    78587    OBJECTIVE   \n",
       "2               2    78587      METHODS   \n",
       "3               3    78587      RESULTS   \n",
       "4               4    78587  CONCLUSIONS   \n",
       "...           ...      ...          ...   \n",
       "17723       17723  2175270  CONCLUSIONS   \n",
       "17724       17724  2175973   BACKGROUND   \n",
       "17725       17725  2175973    OBJECTIVE   \n",
       "17726       17726  2175973      METHODS   \n",
       "17727       17727  2175973      RESULTS   \n",
       "\n",
       "                                                    text  \n",
       "0      The Large Magellanic Cloud (LMC) is to date th...  \n",
       "1      We characterise the distribution and sources o...  \n",
       "2      We analyse 11 months of continuous sky-survey ...  \n",
       "3      The LMC is detected at 33 sigma significance. ...  \n",
       "4      The close correlation between cosmic-ray densi...  \n",
       "...                                                  ...  \n",
       "17723  Recent measurements of hard TPE can easily acc...  \n",
       "17724  Measurements of the Extragalactic Background L...  \n",
       "17725  At infrared wavelengths, however, these measur...  \n",
       "17726  To this purpose we exploit the effect of pair-...  \n",
       "17727  We find that, even under the most extremely fa...  \n",
       "\n",
       "[15667 rows x 4 columns]"
      ],
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Unnamed: 0</th>\n      <th>id</th>\n      <th>move</th>\n      <th>text</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0</td>\n      <td>78587</td>\n      <td>BACKGROUND</td>\n      <td>The Large Magellanic Cloud (LMC) is to date th...</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>1</td>\n      <td>78587</td>\n      <td>OBJECTIVE</td>\n      <td>We characterise the distribution and sources o...</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>2</td>\n      <td>78587</td>\n      <td>METHODS</td>\n      <td>We analyse 11 months of continuous sky-survey ...</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>3</td>\n      <td>78587</td>\n      <td>RESULTS</td>\n      <td>The LMC is detected at 33 sigma significance. 
...</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>4</td>\n      <td>78587</td>\n      <td>CONCLUSIONS</td>\n      <td>The close correlation between cosmic-ray densi...</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>17723</th>\n      <td>17723</td>\n      <td>2175270</td>\n      <td>CONCLUSIONS</td>\n      <td>Recent measurements of hard TPE can easily acc...</td>\n    </tr>\n    <tr>\n      <th>17724</th>\n      <td>17724</td>\n      <td>2175973</td>\n      <td>BACKGROUND</td>\n      <td>Measurements of the Extragalactic Background L...</td>\n    </tr>\n    <tr>\n      <th>17725</th>\n      <td>17725</td>\n      <td>2175973</td>\n      <td>OBJECTIVE</td>\n      <td>At infrared wavelengths, however, these measur...</td>\n    </tr>\n    <tr>\n      <th>17726</th>\n      <td>17726</td>\n      <td>2175973</td>\n      <td>METHODS</td>\n      <td>To this purpose we exploit the effect of pair-...</td>\n    </tr>\n    <tr>\n      <th>17727</th>\n      <td>17727</td>\n      <td>2175973</td>\n      <td>RESULTS</td>\n      <td>We find that, even under the most extremely fa...</td>\n    </tr>\n  </tbody>\n</table>\n<p>15667 rows × 4 columns</p>\n</div>"
     },
     "metadata": {},
     "execution_count": 27
    }
   ],
   "source": [
    "df[df['id'].isin(train_ids)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `~` is the documented operator for inverting a boolean mask. The .copy() makes\n",
    "# train_df an independent frame, so assigning to its columns later does not\n",
    "# trigger SettingWithCopyWarning.\n",
    "train_df = df[~df['id'].isin(test_ids)].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# assign() builds a new frame instead of writing into what may be a slice of df;\n",
    "# the in-place column assignment raised SettingWithCopyWarning.\n",
    "train_df = train_df.assign(move=train_df['move'].map(class_dict))\n",
    "# map() yields NaN for any value that is not a key of class_dict, which also\n",
    "# happens if this cell is run twice. Fail loudly instead of keeping NaN labels.\n",
    "assert train_df['move'].notna().all(), 'move contains values missing from class_dict'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "       Unnamed: 0       id  move  \\\n",
       "0               0    78587     4   \n",
       "1               1    78587     0   \n",
       "2               2    78587     1   \n",
       "3               3    78587     2   \n",
       "4               4    78587     3   \n",
       "...           ...      ...   ...   \n",
       "17723       17723  2175270     3   \n",
       "17724       17724  2175973     4   \n",
       "17725       17725  2175973     0   \n",
       "17726       17726  2175973     1   \n",
       "17727       17727  2175973     2   \n",
       "\n",
       "                                                    text  \n",
       "0      The Large Magellanic Cloud (LMC) is to date th...  \n",
       "1      We characterise the distribution and sources o...  \n",
       "2      We analyse 11 months of continuous sky-survey ...  \n",
       "3      The LMC is detected at 33 sigma significance. ...  \n",
       "4      The close correlation between cosmic-ray densi...  \n",
       "...                                                  ...  \n",
       "17723  Recent measurements of hard TPE can easily acc...  \n",
       "17724  Measurements of the Extragalactic Background L...  \n",
       "17725  At infrared wavelengths, however, these measur...  \n",
       "17726  To this purpose we exploit the effect of pair-...  \n",
       "17727  We find that, even under the most extremely fa...  \n",
       "\n",
       "[15667 rows x 4 columns]"
      ],
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Unnamed: 0</th>\n      <th>id</th>\n      <th>move</th>\n      <th>text</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0</td>\n      <td>78587</td>\n      <td>4</td>\n      <td>The Large Magellanic Cloud (LMC) is to date th...</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>1</td>\n      <td>78587</td>\n      <td>0</td>\n      <td>We characterise the distribution and sources o...</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>2</td>\n      <td>78587</td>\n      <td>1</td>\n      <td>We analyse 11 months of continuous sky-survey ...</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>3</td>\n      <td>78587</td>\n      <td>2</td>\n      <td>The LMC is detected at 33 sigma significance. 
...</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>4</td>\n      <td>78587</td>\n      <td>3</td>\n      <td>The close correlation between cosmic-ray densi...</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>17723</th>\n      <td>17723</td>\n      <td>2175270</td>\n      <td>3</td>\n      <td>Recent measurements of hard TPE can easily acc...</td>\n    </tr>\n    <tr>\n      <th>17724</th>\n      <td>17724</td>\n      <td>2175973</td>\n      <td>4</td>\n      <td>Measurements of the Extragalactic Background L...</td>\n    </tr>\n    <tr>\n      <th>17725</th>\n      <td>17725</td>\n      <td>2175973</td>\n      <td>0</td>\n      <td>At infrared wavelengths, however, these measur...</td>\n    </tr>\n    <tr>\n      <th>17726</th>\n      <td>17726</td>\n      <td>2175973</td>\n      <td>1</td>\n      <td>To this purpose we exploit the effect of pair-...</td>\n    </tr>\n    <tr>\n      <th>17727</th>\n      <td>17727</td>\n      <td>2175973</td>\n      <td>2</td>\n      <td>We find that, even under the most extremely fa...</td>\n    </tr>\n  </tbody>\n</table>\n<p>15667 rows × 4 columns</p>\n</div>"
     },
     "metadata": {},
     "execution_count": 34
    }
   ],
   "source": [
    "train_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ]
}